library(dplyr)
library(scater)
library(SeuratDisk)
library(Seurat)
library(ggplot2)
library(RColorBrewer)
library(readr)
real_validation <- LoadH5Seurat("BrainSmall-realData_2kTest.h5seurat") # real data
scgan_validation <- LoadH5Seurat("scGAN_2kTest.h5seurat") # scGAN data
activa_validation <- LoadH5Seurat("20kBrainSmall-1997Generated_10RecLoss_Epoch600.h5seurat") # ACTIVA data
# Changing the barcodes in ACTIVA validation set to be the same as the real validation set
rownames(activa_validation@meta.data) <- rownames(real_validation@meta.data)
# Generate dataset labels for the metadata
real_labs <- rep("Real (Test Set)", nrow(real_validation@meta.data))
activa_labs <- rep("ACTIVA", nrow(activa_validation@meta.data))
scgan_labs <- rep("scGAN", nrow(scgan_validation@meta.data))
real_validation <- AddMetaData(real_validation,
metadata = real_labs,
col.name = "dataset_label") # adding dataset label to metadata
scgan_validation <- AddMetaData(scgan_validation,
metadata = scgan_labs,
col.name = "dataset_label") # adding dataset label to metadata
activa_validation <- AddMetaData(activa_validation,
metadata = activa_labs,
col.name = "dataset_label") # adding dataset label to metadata
# 3000 highly variable genes identified using variance stabilizing transformation - vst
real_validation <- FindVariableFeatures(object = real_validation,
nfeatures = 3000,
selection.method = "vst")
scgan_validation <- FindVariableFeatures(object = scgan_validation,
nfeatures = 3000,
selection.method = "vst")
activa_validation <- FindVariableFeatures(object = activa_validation,
nfeatures = 3000,
selection.method = "vst")
val_list <- list(real_validation, scgan_validation, activa_validation)
#Integration
validation.anchors <-
FindIntegrationAnchors(object.list = val_list,
dims = 1:30,
anchor.features = 3000)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Computing 3000 integration features
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4873 anchors
## Filtering anchors
## Retained 1851 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4774 anchors
## Filtering anchors
## Retained 1949 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 5844 anchors
## Filtering anchors
## Retained 1708 anchors
validation.int <- IntegrateData(anchorset = validation.anchors,
dims = 1:30)
## Merging dataset 3 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 2 into 1 3
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Adding a command log without an assay associated with it
DefaultAssay(validation.int) <- "integrated"
# Scaling
validation.int <- ScaleData(validation.int)
## Centering and scaling data matrix
# Dim reduction w/PCA
validation.int <- RunPCA(validation.int)
## PC_ 1
## Positive: Cd24a, Tubb3, Rtn1, Nsg1, Sox11, Mllt11, Tagln3, Sox4, Elavl3, Igfbpl1
## 6330403K07Rik, Dpysl3, Dcx, Neurod2, Stmn3, Nsg2, Uchl1, Basp1, Neurod6, Map1b
## Nfix, Stmn4, Kif5c, Hist3h2ba, Fnbp1l, Cdk5r1, Gria2, Mapt, Thra, Nfib
## Negative: Igfbp7, Vamp8, Sparcl1, Slc2a1, Gng11, Sparc, Nfkbia, Ramp2, Slc7a5, Zfp36l1
## Ecscr, Itm2a, Ifitm3, AU021092, Anxa3, Esam, Sepp1, Rhoc, Cd34, Hes1
## Cyba, Ostf1, Serpinh1, Col4a2, Slc40a1, Col4a1, B2m, Tgfb1, Eng, Arhgap29
## PC_ 2
## Positive: Dbi, Vim, Gng5, Phgdh, Slc1a3, Mfge8, Qk, Ddah1, Pea15a, Ndrg2
## Mt3, 1810037I17Rik, Fabp7, Plpp3, Prdx1, Aldoc, Ckb, H2afz, Slc9a3r1, Notch1
## Hmgb2, Atp1a2, Prdx6, Asrgl1, Gnai2, H2afv, Mid1ip1, Oat, Psph, Sox9
## Negative: Tubb3, Mllt11, Nsg1, Stmn3, Rtn1, Cd24a, Mapt, Stmn2, Stmn4, Elavl3
## Dpysl3, Map1b, Dcx, Uchl1, Tmsb10, Sox11, Thra, Basp1, Nsg2, Neurod2
## Gria2, Cdk5r1, Ly6h, Neurod6, 6330403K07Rik, Nrep, Sox4, Gng2, Tagln3, Myt1l
## PC_ 3
## Positive: Hmgb2, H2afz, Cdca3, Lockd, Pbk, Ccna2, Racgap1, Birc5, H2afv, Cks2
## Smc2, Hmgn2, Cenpf, Smc4, Cdca8, Ube2c, Spc25, Mki67, Top2a, Spc24
## Nusap1, Prc1, Tpx2, Cenpa, Cdc20, Cenpe, Ccnb1, Phgdh, Tmpo, Cdkn2c
## Negative: Igfbp7, Bsg, Sparc, Cldn5, Col4a2, Gng11, Ramp2, Col4a1, Ecscr, Esam
## Slc2a1, Eng, Slc7a5, Itm2a, Slc3a2, Sparcl1, Eva1b, Vwa1, Kdr, Egfl7
## Fth1, Phlda1, Serpinh1, Pglyrp1, Ctla2a, Ppic, Tfpi, Sept4, Slc40a1, Myl6
## PC_ 4
## Positive: Ube2s, Spc25, Top2a, Birc5, Ube2c, Cks2, Hmgb2, Hmgn2, Nusap1, Smc2
## Cdca8, Pbk, H2afz, Smc4, Cdca3, Ccna2, Cdk1, Cks1b, Tmpo, Prc1
## H2afx, Tpx2, Ccnb1, Cenpe, Mki67, H2afv, Racgap1, Lockd, Cenpf, Cenpa
## Negative: Aldoc, Ptprz1, Cspg5, Fabp7, Mt3, Slc1a3, Dbi, Ttyh1, Ndrg2, Hmgcs1
## Plpp3, Tnc, Hes5, Ccdc80, Pdpn, Mid1ip1, Mmd2, Luzp2, Fjx1, Pea15a
## Scd2, Tst, Cd9, Tspan7, Ddah1, Mlc1, Fgfbp3, Glud1, Vcam1, Qk
## PC_ 5
## Positive: Meg3, Nrxn3, Tubb2a, Dlx2, Gng2, Stmn2, Arx, Dlx6os1, Dlx1, Thra
## Syt1, Ly6h, Ina, Dlx5, Sp9, Mef2c, Tmsb10, Gad2, Gap43, Map1b
## Xist, Stmn3, Gm13889, Mapt, Maf, Pcsk1n, Rpp25, Gad1, Gria1, Nxph1
## Negative: Igfbpl1, Mdk, Ppp2r2b, Lhx2, Eomes, Pantr1, Tsc22d1, Nfix, Plekhf2, Unc5d
## Nfib, Pou3f2, Ezr, Itm2b, Mfap4, Rnd2, Neurod2, Sox11, Igsf8, Cdk2ap1
## Pam, Sstr2, Tagln3, Ddah2, Mllt3, Dhrs4, Ttc28, Plcb1, Rasgef1b, Neurod6
ElbowPlot(validation.int,
ndims = 40)
# Jackstraw plot
# Takes a bit of time to run
validation.int <- JackStraw(validation.int,
dims = 50)
validation.int <- ScoreJackStraw(validation.int,
dims = 1:50)
JackStrawPlot(validation.int,
dims = 1:50)
# Clustering
usepcs_int <- 1:50
validation.int <- FindNeighbors(object = validation.int,
dims = usepcs_int)
## Computing nearest neighbor graph
## Computing SNN
validation.int <- FindClusters(object = validation.int,
resolution = seq(0.10, 2, 0.10))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9283
## Number of communities: 5
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8849
## Number of communities: 6
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8630
## Number of communities: 8
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8466
## Number of communities: 11
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8328
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8193
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8062
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7937
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7839
## Number of communities: 15
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7745
## Number of communities: 17
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7652
## Number of communities: 18
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7567
## Number of communities: 17
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7488
## Number of communities: 18
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7414
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7342
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7267
## Number of communities: 20
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7198
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7136
## Number of communities: 22
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7082
## Number of communities: 23
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5991
## Number of edges: 287554
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7028
## Number of communities: 26
## Elapsed time: 0 seconds
grep("res", colnames(validation.int@meta.data), value = TRUE) %>%
purrr::map_chr(~ paste(.x, "--> clusters generated:", length(unique(
validation.int@meta.data[, .x]))))
## [1] "integrated_snn_res.0.1 --> clusters generated: 5"
## [2] "integrated_snn_res.0.2 --> clusters generated: 6"
## [3] "integrated_snn_res.0.3 --> clusters generated: 8"
## [4] "integrated_snn_res.0.4 --> clusters generated: 11"
## [5] "integrated_snn_res.0.5 --> clusters generated: 12"
## [6] "integrated_snn_res.0.6 --> clusters generated: 13"
## [7] "integrated_snn_res.0.7 --> clusters generated: 13"
## [8] "integrated_snn_res.0.8 --> clusters generated: 14"
## [9] "integrated_snn_res.0.9 --> clusters generated: 15"
## [10] "integrated_snn_res.1 --> clusters generated: 17"
## [11] "integrated_snn_res.1.1 --> clusters generated: 18"
## [12] "integrated_snn_res.1.2 --> clusters generated: 17"
## [13] "integrated_snn_res.1.3 --> clusters generated: 18"
## [14] "integrated_snn_res.1.4 --> clusters generated: 21"
## [15] "integrated_snn_res.1.5 --> clusters generated: 21"
## [16] "integrated_snn_res.1.6 --> clusters generated: 20"
## [17] "integrated_snn_res.1.7 --> clusters generated: 21"
## [18] "integrated_snn_res.1.8 --> clusters generated: 22"
## [19] "integrated_snn_res.1.9 --> clusters generated: 23"
## [20] "integrated_snn_res.2 --> clusters generated: 26"
Idents(validation.int) <- "integrated_snn_res.0.3"
# Dimension reduction
validation.int <- RunTSNE(object = validation.int,
reduction.use = "pca",
dims = usepcs_int,
do.fast = TRUE)
validation.int <- RunUMAP(validation.int,
reduction = "pca",
dims = usepcs_int)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:04:51 UMAP embedding parameters a = 0.9922 b = 1.112
## 11:04:51 Read 5991 rows and found 50 numeric columns
## 11:04:51 Using Annoy for neighbor search, n_neighbors = 30
## 11:04:51 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:04:52 Writing NN index file to temp file /var/folders/r9/z4byyk895zbc4qs07n05cph80000gn/T//Rtmpgf5kfA/filea895436966b8
## 11:04:52 Searching Annoy index using 1 thread, search_k = 3000
## 11:04:53 Annoy recall = 100%
## 11:04:54 Commencing smooth kNN distance calibration using 1 thread
## 11:04:55 Initializing from normalized Laplacian + noise
## 11:04:55 Commencing optimization for 500 epochs, with 273440 positive edges
## 11:05:04 Optimization finished
# Saving the datasets in different formats for interoperability with python packages
save(validation.int, file = "data/final_brainsmall_val_int_clust.RData")
SaveH5Seurat(validation.int,
filename = "final_brainsmall_val_int_clust.h5Seurat",
overwrite = FALSE)
Convert("final_brainsmall_val_int_clust.h5Seurat", dest = "h5ad")
# scater is a toolkit like Seurat but uses a different data structure (Single Cell Experiment (SCE))
# First will use seurat to generate the SCE data by calling the as.SingleCellExperiment() on our Seurat object
# Removing negative expression values before converting to SCE object
cleaned_Hmgb2 <- WhichCells(validation.int,
expression = Hmgb2 >= 0)
validation.int <- subset(validation.int, cells = cleaned_Hmgb2)
# Converting the seurat object into a SCE object
val_int_sce <- as.SingleCellExperiment(validation.int)
names(assays(val_int_sce)) <- "counts" # making sure assay name is correct for expression values
# Creating an object "logcounts" which houses log2 transformed count values
logcounts(val_int_sce) <- log2(counts(val_int_sce)+1)
## Warning in eval(call, parent.frame()): NaNs produced
dim(logcounts(val_int_sce))
## [1] 3000 5953
p_clusters <-
plotUMAP(val_int_sce, colour_by = "ident", other_fields = "dataset_label") +
facet_wrap( ~ dataset_label) +
theme(legend.title = element_blank(),
strip.text.x = element_text(face = "bold"))
p_clusters
Clusters
ggsave(
filename = "plots/integrated/brainsmall_final_validation_scater_umap.png",
dpi = 320,
units = "in",
width = 8,
height = 4
)
p_Hmgb2 <- plotExpression(
val_int_sce,
features = c("Hmgb2"),
colour_by = "ident",
x = "ident",
other_fields = "dataset_label",
exprs_values = "logcounts"
) +
facet_wrap( ~ dataset_label) +
xlab("Cluster") +
ggtitle("Marker: Hmgb2") +
theme(legend.title = element_blank(),
strip.text.x = element_text(face = "bold"))
p_Hmgb2
Violin Plots
ggsave(
filename = "plots/integrated/brainsmall_final_validation_scater_violin_Hmgb2_logcounts.png",
dpi = 320,
units = "in",
width = 8,
height = 4
)
p_density_Hmgb2 <-
plotUMAP(val_int_sce, colour_by = "Hmgb2", other_fields = "dataset_label") +
facet_wrap( ~ dataset_label) +
scale_fill_viridis_c(option = "B") +
ggtitle("Marker: Hmgb2") +
theme(legend.title = element_blank(),
strip.text.x = element_text(face = "bold"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p_density_Hmgb2
Expression Plot
ggsave(
filename = "plots/integrated/brainsmall_final_validation_scater_UMAPexpression_Hmgb2.png",
dpi = 320,
units = "in",
width = 8,
height = 4
)